import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
import reviewminer as rm
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# download necessary resources from NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
2023-03-13 08:21:02.153584: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-03-13 08:21:03.114025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory 2023-03-13 08:21:03.114087: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory 2023-03-13 08:21:03.114093: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. /tmp/ipykernel_33080/3131392801.py:6: DeprecationWarning: `import kerastuner` is deprecated, please use `import keras_tuner`. from kerastuner.tuners import RandomSearch [nltk_data] Downloading package brown to /home/kasyap/nltk_data... [nltk_data] Package brown is already up-to-date! [nltk_data] Downloading package punkt to /home/kasyap/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /home/kasyap/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! [nltk_data] Downloading package stopwords to /home/kasyap/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to /home/kasyap/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package wordnet to /home/kasyap/nltk_data... 
[nltk_data] Package wordnet is already up-to-date!
# Load the labelled movie-review dataset (columns: review, SentimentValue).
df = pd.read_excel('Movie Review dataset1.xlsx')
# Peek at the first hundred rows to sanity-check the load.
df.head(n=100)
| | review | SentimentValue |
|---|---|---|
| 0 | Faces are slashed, throats are cut, blood squi... | Positive |
| 1 | Reasonably effective horror/science-fiction a ... | Negative |
| 2 | The villian in this movie is one mean sob and ... | Negative |
| 3 | This was one of the few shows that my wife and... | Negative |
| 4 | Sweet, rich valley girl develops crush on a pu... | Negative |
| ... | ... | ... |
| 95 | This is one of the worst movies I have ever se... | Positive |
| 96 | Camp Blood III is a vast improvement on Camp B... | Positive |
| 97 | My Santa Lucia Choir was chosen to be in this ... | Negative |
| 98 | Just finished watching 2FTM. The trailers intr... | Positive |
| 99 | Well, "Cube" (1997), Vincenzo's first movie, w... | Negative |
100 rows × 2 columns
# Build the expensive, call-invariant resources once at module level instead
# of on every call: this function is applied to ~43k reviews, and rebuilding
# the stop-word set and lemmatizer per call dominates the runtime.
_STOP_WORDS = set(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one raw review string for bag-of-words modelling.

    Steps: lowercase, strip digits, strip punctuation, word-tokenize,
    drop English stop words, lemmatize each token, re-join with spaces.

    Args:
        text: raw review text.

    Returns:
        The cleaned, space-joined string of lemmatized tokens.
    """
    # convert text to lowercase
    text = text.lower()
    # remove numbers
    text = _DIGITS_RE.sub('', text)
    # remove punctuation in a single C-level pass
    text = text.translate(_PUNCT_TABLE)
    # tokenize the text
    tokens = word_tokenize(text)
    # remove stop words
    filtered_tokens = [token for token in tokens if token not in _STOP_WORDS]
    # lemmatize the tokens
    lemmatized_tokens = [_LEMMATIZER.lemmatize(token) for token in filtered_tokens]
    # join the tokens back into a string
    return ' '.join(lemmatized_tokens)
# Clean every review in place with the normalizer defined above.
df['review'] = df['review'].apply(preprocess_text)
# NOTE(review): removed a stray `sample_rm = rm.ReviewMiner(reviews_df,
# id_column="Id", review_column='Text')` line here — `reviews_df` is never
# defined (NameError) and the column names did not exist in this frame; the
# miner is constructed correctly below on `df` with the id/review columns.
df
| | review | SentimentValue |
|---|---|---|
| 0 | face slashed throat cut blood squirt end three... | Positive |
| 1 | reasonably effective horrorsciencefiction la a... | Negative |
| 2 | villian movie one mean sob seems enjoy guess m... | Negative |
| 3 | one show wife agreed watching upset hear cance... | Negative |
| 4 | sweet rich valley girl develops crush punk all... | Negative |
| ... | ... | ... |
| 43559 | everyone seen movie feel responsibility tell u... | Positive |
| 43560 | spheeris debut must one best music documentary... | Negative |
| 43561 | film pretty confusing ludicrous plot awfulbut ... | Positive |
| 43562 | show average doesnt make laugh particularly ho... | Positive |
| 43563 | hard believe director barbet schroeder majesti... | Positive |
43564 rows × 2 columns
df["id"] = df.index + 1
sample_rm = rm.ReviewMiner(df, id_column="id", review_column='review')
sample_rm.one_time_analysis()
========= Popular aspects and opinions in the data ========= Aspect Opinion Extractor job begins: 2023-03-13 07:01:35.784673 There are in total 43564 comments. We will report progress every 4356 comments. 0.92 min later: finished 10.00% 1.8 min later: finished 20.00% 2.7 min later: finished 30.00% 3.5 min later: finished 40.00% 4.4 min later: finished 50.00% 5.3 min later: finished 59.99% 6.2 min later: finished 69.99% 7.0 min later: finished 79.99% 8.0 min later: finished 89.99% 8.8 min later: finished 99.99%
/home/kasyap/anaconda3/envs/py39/lib/python3.9/site-packages/reviewminer/aspect_opinion.py:339: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only. aspects_opinions_df = aspects_opinions_df[aspects_opinions_df['opinions'] != ""] \
========= Sentiment Analysis ========= Average sentiment score: 0.04 69.75% of the comments are positive; 0.09% of the comments are neutral; 30.15% of the comments are negative
<Figure size 500x500 with 0 Axes>
/home/kasyap/anaconda3/envs/py39/lib/python3.9/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. trace_data = trace_data.append(trace_data.iloc[0])
df =df.drop(["id"],axis =1)
df["SentimentValue"].value_counts()
Positive 26550 Negative 17014 Name: SentimentValue, dtype: int64
import numpy as np

# Encode the string labels as integers: "Positive" -> 1, anything else -> 0.
df['sentiment'] = (df['SentimentValue'] == "Positive").astype(int)
# Confirm the integer counts match the string-label counts above.
df['sentiment'].value_counts()
1 26550 0 17014 Name: sentiment, dtype: int64
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hold out a test set (train_test_split's default 25%) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], random_state=0)

# Bag-of-words features: raw token counts, English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
X_train_matrix = vectorizer.fit_transform(X_train)
X_test_matrix = vectorizer.transform(X_test)

# Multinomial Naive Bayes baseline fitted on the count matrix.
nb_classifier = MultinomialNB().fit(X_train_matrix, y_train)

# Score the held-out split.
y_pred = nb_classifier.predict(X_test_matrix)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.8550179046919475
# Per-class precision/recall/F1 for the Naive Bayes model.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.84 0.76 0.80 4178
1 0.86 0.91 0.89 6713
accuracy 0.86 10891
macro avg 0.85 0.84 0.84 10891
weighted avg 0.85 0.86 0.85 10891
# Confusion matrix for the Naive Bayes predictions, rendered as a heatmap.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
labels = ['Negative', 'Positive']
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)  # annotate cell counts
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
# plt.savefig("./images/output8.png")
plt.show()
# Re-split identically (same random_state) and switch to TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], random_state=0)

vectorizer = TfidfVectorizer(stop_words='english')
X_train_matrix = vectorizer.fit_transform(X_train)
X_test_matrix = vectorizer.transform(X_test)

# Train an XGBoost booster via the native API on the sparse TF-IDF matrix.
params = {'objective': 'binary:logistic', 'eval_metric': 'error'}
dtrain = xgb.DMatrix(X_train_matrix, label=y_train)
xgb_classifier = xgb.train(params, dtrain)

# Booster.predict returns positive-class probabilities; round to hard 0/1.
dtest = xgb.DMatrix(X_test_matrix)
y_pred = xgb_classifier.predict(dtest)
y_pred_binary = [int(round(p)) for p in y_pred]
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)
Accuracy: 0.7813791203746212
# Per-class precision/recall/F1 for the XGBoost model.
print(classification_report(y_test, y_pred_binary))
precision recall f1-score support
0 0.75 0.64 0.69 4178
1 0.80 0.87 0.83 6713
accuracy 0.78 10891
macro avg 0.77 0.76 0.76 10891
weighted avg 0.78 0.78 0.78 10891
# Confusion matrix for the XGBoost predictions, rendered as a heatmap.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred_binary)
labels = ['Negative', 'Positive']
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)  # annotate cell counts
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
# plt.savefig("./images/output8.png")
plt.show()
# Hyper-parameter search for XGBoost using the sklearn wrapper API.
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

clf = XGBClassifier()
param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Vectorize the split.
# NOTE(review): the TF-IDF vectorizer was already fitted on this same
# X_train above, so refitting is redundant (identical vocabulary) but
# harmless; kept to preserve the original flow.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 5-fold cross-validated search over all 27 parameter combinations.
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train_vec, y_train)
print('Best hyperparameters: ', grid_search.best_params_)

# Evaluate the refit best estimator on the held-out split.
y_pred = grid_search.predict(X_test_vec)
# Use accuracy_score for consistency with every other evaluation in this
# file; np.mean(y_pred == y_test) computed the identical value.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)
Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
Accuracy: 0.8480396657790836
# Per-class precision/recall/F1 for the tuned XGBoost model.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.81 0.78 0.80 4178
1 0.87 0.89 0.88 6713
accuracy 0.85 10891
macro avg 0.84 0.84 0.84 10891
weighted avg 0.85 0.85 0.85 10891
# Confusion matrix for the tuned XGBoost predictions, rendered as a heatmap.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
labels = ['Negative', 'Positive']
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)  # annotate cell counts
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
# plt.savefig("./images/output8.png")
plt.show()
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

# Same split as the classical models (same random_state=0).
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], random_state=0)

# load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode as TensorFlow tensors. Without return_tensors the tokenizer yields
# a BatchEncoding of plain Python lists, which Keras' fit() cannot consume;
# request TF tensors and unwrap the BatchEncoding into an ordinary dict.
X_train_encodings = dict(tokenizer(X_train.tolist(), truncation=True, padding=True, return_tensors='tf'))
X_test_encodings = dict(tokenizer(X_test.tolist(), truncation=True, padding=True, return_tensors='tf'))

# Integer labels for SparseCategoricalCrossentropy.
y_train = y_train.values.astype('int32')
y_test = y_test.values.astype('int32')

# Pre-trained BERT with a fresh 2-way classification head.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Compile: small LR typical for BERT fine-tuning; the model emits raw
# logits, hence from_logits=True.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Fine-tune for 3 epochs, holding out 10% of the training set for validation.
history = model.fit(x=X_train_encodings, y=y_train, batch_size=32, epochs=3, validation_split=0.1)

# Hard predictions = argmax over the two output logits per example.
y_pred = model.predict(X_test_encodings).logits.argmax(axis=-1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
2023-03-13 07:57:23.774987: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected 2023-03-13 07:57:23.775003: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (kasyap-IdeaPad): /proc/driver/nvidia/version does not exist 2023-03-13 07:57:23.775205: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. All model checkpoint layers were used when initializing TFBertForSequenceClassification. Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# split the data into training and testing sets (default 75/25 split,
# same random_state=0 as the classical models above)
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], random_state=0)
# load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# encode the text data using the tokenizer
# NOTE(review): called without return_tensors, these are BatchEncodings of
# plain Python lists; the tuner.search/model.fit calls below assume Keras
# can consume that — verify, or pass return_tensors='tf'.
X_train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
X_test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)
# convert the labels to int32 numpy arrays for SparseCategoricalCrossentropy
y_train = y_train.values.astype('int32')
y_test = y_test.values.astype('int32')
# define the model-building function used by Keras Tuner (called per trial)
def build_model(hp):
    """Build and compile a TFBert sequence classifier for one tuner trial.

    Args:
        hp: Keras Tuner HyperParameters object supplying this trial's values.

    Returns:
        A compiled TFBertForSequenceClassification model.
    """
    # load the pre-trained BERT model (fresh weights every trial)
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    # NOTE(review): despite the name, this loop FREEZES the first
    # `num_bert_layers` encoder layers (trainable = False); the remaining
    # layers stay trainable. Confirm that freezing — not "making
    # trainable" — is the intended meaning of this hyperparameter.
    num_bert_layers = hp.Int('num_bert_layers', min_value=1, max_value=12, step=1)
    for layer in model.bert.encoder.layer[:num_bert_layers]:
        layer.trainable = False
    # compile with a tuned learning rate; model outputs raw logits,
    # hence from_logits=True
    optimizer = tf.keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', values=[1e-3, 2e-5]), epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    return model
# define the hyperparameters to search over
tuner_hp = HyperParameters()
tuner_hp.Int('num_bert_layers', 1, 12, step=1)
tuner_hp.Choice('learning_rate', values=[1e-3, 2e-5])
# NOTE(review): tuner_hp is never passed to RandomSearch (no
# `hyperparameters=tuner_hp` argument), so this pre-registration has no
# effect; the search space actually comes from the hp.Int/hp.Choice calls
# inside build_model. Confirm intent.
# define the Keras Tuner RandomSearch object
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    directory='bert_sentiment_tuning',
    project_name='sentiment_analysis')
# search for the best hyperparameters (3 epochs per trial, 10% validation)
tuner.search(x=X_train_encodings, y=y_train, epochs=3, validation_split=0.1)
# get the best hyperparameters and build a fresh (untrained) final model
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
model = build_model(best_hp)
# train the final model and evaluate accuracy on the test data
history = model.fit(x=X_train_encodings, y=y_train, batch_size=32, epochs=3, validation_split=0.1)
# hard predictions = argmax over the two output logits per example
y_pred = model.predict(X_test_encodings).logits.argmax(axis=-1)
accuracy = accuracy_score(y_test, y_pred)
# NOTE(review): this prints the HyperParameters object's repr, not the
# chosen values; best_hp.values would be more informative.
print("Best hyperparameters:", best_hp)
print("Accuracy:", accuracy)
Trial 2 Complete [00h 00m 31s] Best val_accuracy So Far: None Total elapsed time: 00h 01m 01s Search: Running Trial #3 Value |Best Value So Far |Hyperparameter 1 |? |num_bert_layers 0.001 |? |learning_rate
All model checkpoint layers were used when initializing TFBertForSequenceClassification. Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. IOPub data rate exceeded. The notebook server will temporarily stop sending output to the client in order to avoid crashing it. To change this limit, set the config variable `--NotebookApp.iopub_data_rate_limit`. Current values: NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec) NotebookApp.rate_limit_window=3.0 (secs)